{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import csv\n",
    "\n",
    "# Names of the raw splits; each one is read from data/yelp/<name>.csv\n",
    "split = [ 'train', 'valid', 'test' ]\n",
    "\n",
    "# Pool the rows of every split into one list (each row is the list of column strings of one CSV record)\n",
    "Reviews = []\n",
    "for source in split:\n",
    "    # Python 2's csv module expects the file to be opened in binary mode ('rb');\n",
    "    # text mode can mangle line endings on platforms that distinguish the two.\n",
    "    with open( 'data/yelp/' + source + '.csv', 'rb' ) as f:\n",
    "        Reader = csv.reader( f, delimiter=',', quoting=csv.QUOTE_MINIMAL )\n",
    "        # csv.reader yields [] for a blank line; drop those rows so that code\n",
    "        # indexing the columns of every record does not hit an IndexError.\n",
    "        Reviews.extend( record for record in Reader if record )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false,
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "# Tokenization and lemmatization\n",
    "# Strip hyperlinks, pad punctuation with spaces, tokenize, POS-tag, then lemmatize\n",
    "from nltk.tokenize import word_tokenize, sent_tokenize\n",
    "from nltk.stem import WordNetLemmatizer\n",
    "import string\n",
    "import nltk\n",
    "import re\n",
    "\n",
    "wordnet_lemmatizer = WordNetLemmatizer()\n",
    "# Punctuation is NOT removed: each mark only gets a space appended after it (loop below)\n",
    "punctuations = list( string.punctuation )\n",
    "# Non-greedy match so every '<a ... /a>' span is removed on its own. A greedy '.*'\n",
    "# runs from the first '<a' to the last '/a>' on a line and would also delete any\n",
    "# review text sitting between two links.\n",
    "hyperlink_pattern = re.compile( r'<a.*?/a>' )\n",
    "\n",
    "# Each entry of Data is [ lemmatized text, helpfulness, token count as a string ]\n",
    "Data = []\n",
    "for review in Reviews:\n",
    "    # column 0 is decoded from UTF-8 and processed as the review text;\n",
    "    # column 1 is carried through unchanged as the helpfulness field\n",
    "    raw_text = review[ 0 ].decode( 'utf-8' )\n",
    "    helpfulness = review[ 1 ]\n",
    "\n",
    "    # remove hyperlinks in review\n",
    "    text = hyperlink_pattern.sub( ' ', raw_text )\n",
    "    # append a space to every punctuation mark so it is separated from the text that follows it\n",
    "    for ch in punctuations:\n",
    "        text = text.replace( ch, ch + ' ' )\n",
    "\n",
    "    tokens = [ word for sent in sent_tokenize( text ) for word in word_tokenize( sent ) ]\n",
    "\n",
    "    lemmas = []\n",
    "    for word, tag in nltk.pos_tag( tokens ):\n",
    "        if tag[:2] == 'VB':\n",
    "            # tags starting with 'VB' are lemmatized as verbs\n",
    "            lemmas.append( wordnet_lemmatizer.lemmatize( word, 'v' ) )\n",
    "        else:\n",
    "            # everything else uses the lemmatizer's default part of speech\n",
    "            lemmas.append( wordnet_lemmatizer.lemmatize( word ) )\n",
    "\n",
    "    Data.append( [ ' '.join( lemmas ), helpfulness, str( len( lemmas ) ) ] )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import csv\n",
    "\n",
    "import numpy as np\n",
    "\n",
    "# Arbitrary fixed seed so the shuffle, and therefore the train/valid/test split, is reproducible\n",
    "SEED = 1234\n",
    "np.random.seed( SEED )\n",
    "\n",
    "data_size = len( Data )\n",
    "np.random.shuffle( Data )\n",
    "\n",
    "# Convert the third field to int so the sorts below compare numbers, not strings\n",
    "Data = [ [record[0], record[1], int( record[2]) ] for record in Data ]\n",
    "\n",
    "# Split into train, valid, test - 60%, 20%, 20% (test takes whatever remains after rounding)\n",
    "train_size = int( data_size * 0.6 )\n",
    "valid_size = int( data_size * 0.2 )\n",
    "\n",
    "train = Data[ :train_size ]\n",
    "valid = Data[ train_size:train_size + valid_size ]\n",
    "test = Data[ train_size + valid_size:]\n",
    "\n",
    "# Sort every split by its third field (the length), largest first\n",
    "for subset in ( train, valid, test ):\n",
    "    subset.sort( key=lambda x:x[2], reverse=True )\n",
    "\n",
    "# Save each split to data/<name>.csv\n",
    "split = [ 'train', 'valid', 'test' ]\n",
    "for data_src, Records in zip( split, [ train, valid, test ] ):\n",
    "    # Python 2's csv module expects binary mode ('wb'); in text mode the writer's\n",
    "    # own line terminator can be translated again on platforms that rewrite newlines.\n",
    "    with open( 'data/' + data_src + '.csv', 'wb' ) as f:\n",
    "        Writer = csv.writer( f, delimiter=',', quoting=csv.QUOTE_MINIMAL )\n",
    "        for text, helpfulness, length in Records:\n",
    "            # every field is written as a UTF-8 encoded string\n",
    "            row = [ text, helpfulness, str( length ) ]\n",
    "            Writer.writerow( [ item.encode( 'utf-8' ) for item in row ] )"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## GloVe Word Vector Configuration"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Parse the pretrained GloVe file (200d) into a word -> list-of-floats lookup\n",
    "Word2vec_dic = {}\n",
    "with open( 'word2vecs/glove.6B/glove.6B.200d.txt', 'r' ) as glove_file:\n",
    "    for raw_line in glove_file:\n",
    "        # first whitespace-separated field is the word, the remaining fields are its components\n",
    "        fields = raw_line.split()\n",
    "        Word2vec_dic[ fields[0] ] = [ float( value ) for value in fields[1:] ]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Star import from the `data` module; `Corpus` is presumably defined there\n",
    "# (it is not defined anywhere in this notebook).\n",
    "# NOTE(review): later cells may rely on other names this star import brings in\n",
    "# (e.g. `torch` is used below) - confirm before narrowing it to explicit names.\n",
    "from data import *\n",
    "\n",
    "# Corpus is handed the directory that the train/valid/test CSVs were written to;\n",
    "# what it reads from there is not visible in this notebook.\n",
    "path = './data'\n",
    "corpus = Corpus( path )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Imported explicitly so this cell does not depend on another module re-exporting torch\n",
    "import torch\n",
    "\n",
    "# Ask for the GPU, but only use it when one is actually available, so the cell\n",
    "# also runs on CPU-only machines instead of failing in torch.cuda.LongTensor.\n",
    "want_cuda = True\n",
    "cuda = want_cuda and torch.cuda.is_available()\n",
    "emsize = 200       # width of every embedding row; pretrained vectors of another length are not used\n",
    "initrange = 0.1    # rows without a pretrained vector are drawn uniformly from [-initrange, initrange]\n",
    "ntokens = len( corpus.dictionary )\n",
    "\n",
    "# Allocated uninitialised; every row is assigned in the loop below\n",
    "emb_matrix = torch.FloatTensor( ntokens, emsize )\n",
    "word_idx_list = []\n",
    "for idx in range( ntokens ):\n",
    "    vec = Word2vec_dic.get( corpus.dictionary.idx2word[ idx ] )\n",
    "    if vec is not None and len( vec ) == emsize:\n",
    "        emb_matrix[ idx ] = torch.FloatTensor( vec )\n",
    "    else:\n",
    "        # No usable pretrained vector: record the index and initialise the row randomly.\n",
    "        # An explicit check replaces the former bare `except:` so unrelated errors\n",
    "        # (and KeyboardInterrupt) are no longer silently swallowed.\n",
    "        word_idx_list.append( idx )\n",
    "        emb_matrix[ idx ] = torch.FloatTensor( emsize ).uniform_( -initrange, initrange )\n",
    "\n",
    "# Get Index of Word Embedding that need to be updated during training\n",
    "if cuda:\n",
    "    word_idx_list = torch.cuda.LongTensor( word_idx_list )\n",
    "else:\n",
    "    word_idx_list = torch.LongTensor( word_idx_list )\n",
    "\n",
    "torch.save( emb_matrix, 'data/emb_matrix.pt' )\n",
    "torch.save( word_idx_list, 'data/word_idx_list.pt' )"
   ]
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python [conda root]",
   "language": "python",
   "name": "conda-root-py"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
